In [1]:
import os
import time
import math
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing
from pandarallel import pandarallel
import requests
import sys
import nltk
from textblob import TextBlob
from wordcloud import WordCloud
from google.cloud import storage
from textblob.sentiments import NaiveBayesAnalyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import spacy
from collections import Counter
import concurrent.futures
import warnings
# Silence all warnings for the rest of the notebook.
# (A preceding simplefilter('once') was removed: each call inserts its rule at the
# front of the filter list, so the blanket 'ignore' below always shadowed it.)
warnings.simplefilter('ignore')
# Category-specific rules kept for explicitness; with the blanket rule above in
# place they do not change which warnings are shown.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
In [2]:
# Notebook-wide pandas display limits: 100 rows, every column, 500-character cells.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', None),
    ('display.max_colwidth', 500),
):
    pd.set_option(option_name, option_value)
In [3]:
# Use all but one of the available cores, but never fewer than one worker:
# on a single-core machine cpu_count() - 1 would be 0.
num_processors = multiprocessing.cpu_count()
workers = max(1, num_processors - 1)
print(f'Using {workers} workers')
Using 15 workers
In [4]:
# Start pandarallel with `workers` worker processes and a progress bar.
# use_memory_fs=False selects pipe-based data transfer (see the INFO message below).
pandarallel.initialize(nb_workers=workers, use_memory_fs=False, progress_bar=True)
INFO: Pandarallel will run on 15 workers. INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
1. Import Data¶
In [5]:
%%time
# Load the news dataset from a local parquet file into `news` (cell is timed by %%time).
file_path = 'news_cleaned.parquet'
news = pd.read_parquet(file_path)
CPU times: user 19.8 s, sys: 29.5 s, total: 49.2 s Wall time: 36.9 s
In [6]:
news.shape # (198064, 16)
Out[6]:
(198064, 16)
In [7]:
news.columns
Out[7]:
Index(['url', 'date', 'language', 'title', 'text', 'year', 'month', 'day',
'text_ner', 'text_cleaned', 'text_lemm', 'title_ner', 'title_cleaned',
'title_lemm', 'title_word_count', 'text_word_count'],
dtype='object')
In [8]:
news.sample(1, random_state = 42)[['text_ner', 'text_cleaned', 'text_lemm', 'title_ner', 'title_cleaned', 'title_lemm']]
Out[8]:
| text_ner | text_cleaned | text_lemm | title_ner | title_cleaned | title_lemm | |
|---|---|---|---|---|---|---|
| 196666 | Prosecutors in all states urge Congress to strengthen tools to fight AI child sexual abuse images Skip to contentCommunity Coverage TourHome ProMedically SpeakingBest of the WestChampions in AgBack to Our AppsCOVID 19Food for NewsTexasNew to a TipLatest CamsClosings and DelaysSend Us Your Weather PhotosTxDOT Highway ConditionsDownload the Weather AppWeather ResourcesKCBD InvestigatesSubmit a TipChad Read ShootingReagor Dykes CoverageSex Trafficking on the South PlainsLubbock County Medical E... | prosecutors states urge congress strengthen tools fight ai child sexual abuse images skip contentcommunity coverage tourhome promedically speakingbest westchampions agback appscovid newstexasnew tiplatest camsclosings delayssend us weather photostxdot highway conditionsdownload weather appweather resourceskcbd investigatessubmit tipchad read shootingreagor dykes coveragesex trafficking south plainslubbock county medical examiner school beat petestats predictionshow watchcommunitytell somethi... | prosecutor state urge congress strengthen tool fight ai child sexual abuse image skip contentcommunity coverage tourhome promedically speakingbest westchampions agback appscovid newstexasnew tiplatest camsclosings delayssend u weather photostxdot highway conditionsdownload weather appweather resourceskcbd investigatessubmit tipchad read shootingreagor dyke coveragesex traffic south plainslubbock county medical examiner school beat petestats predictionshow watchcommunitytell something goodnot... | Prosecutors in all states urge Congress to strengthen tools to fight AI child sexual abuse images | prosecutors states urge congress strengthen tools fight ai child sexual abuse images | prosecutor state urge congress strengthen tool fight ai child sexual abuse image |
2. Sentiment Analysis with TextBlob: Polarity and Subjectivity¶
textblob.sentiments module contains two sentiment analysis implementations
- PatternAnalyzer (based on the pattern library: https://www.clips.uantwerpen.be/pattern)
- NaiveBayesAnalyzer (an NLTK classifier trained on a movie reviews corpus).
The default implementation is PatternAnalyzer, but you can override the analyzer to use NaiveBayesAnalyzer
Polarity and Subjectivity:
- Polarity is a float in the range [-1, 1], where 1 means a positive statement and -1 means a negative statement.
- Subjective sentences generally refer to personal opinion, emotion or judgment, whereas objective sentences refer to factual information. Subjectivity is also a float, in the range [0, 1], where 0 is very objective and 1 is very subjective.
In [9]:
# Function to analyze sentiment and categorize based on polarity
def analyze_and_categorize_sentiment(text):
    """Score `text` with TextBlob and bucket it by the sign of its polarity.

    Parameters
    ----------
    text : str
        The text passed to ``TextBlob``.

    Returns
    -------
    tuple
        ``(sentiment_label, polarity, subjectivity)`` where ``sentiment_label`` is
        ``'positive'`` for polarity > 0, ``'negative'`` for polarity < 0 and
        ``'neutral'`` otherwise.
    """
    scores = TextBlob(text).sentiment
    polarity = scores.polarity
    subjectivity = scores.subjectivity

    # Start from the fall-through case and override it when the sign is decisive.
    sentiment_label = 'neutral'
    if polarity > 0:
        sentiment_label = 'positive'
    elif polarity < 0:
        sentiment_label = 'negative'

    return sentiment_label, polarity, subjectivity
In [10]:
%%time
# Apply the function in parallel
# Each element of `results` is the (label, polarity, subjectivity) tuple returned by
# analyze_and_categorize_sentiment for the corresponding row's 'text_cleaned' value.
results = news['text_cleaned'].parallel_apply(analyze_and_categorize_sentiment)
VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13205), Label(value='0 / 13205')))…
CPU times: user 9.08 s, sys: 5.81 s, total: 14.9 s Wall time: 1min 48s
In [11]:
# Create a DataFrame from the results.
# Reuse the index of `results` (which is the index of `news`) so that the later
# `news.join(df_sentiments_textblob)` matches rows by label. Without it the new frame
# gets a fresh 0..n-1 index and the join is only correct if `news` has that same index.
df_sentiments_textblob = pd.DataFrame(
    results.tolist(),
    index=results.index,
    columns=['tblob_sent', 'tblob_score', 'tblob_sub'],
)
In [12]:
news = news.join(df_sentiments_textblob)
In [13]:
news[news['tblob_sent'] == 'positive'][['text_ner', 'tblob_sent', 'tblob_score', 'tblob_sub']].sample(3, random_state = 42)
Out[13]:
| text_ner | tblob_sent | tblob_score | tblob_sub | |
|---|---|---|---|---|
| 4571 | Europe s bid for AI standard faces long road, EU lawmakers say NewsBreakSign ArtTV SeriesBooks DanceBehind Viral VideosPerforming ArtsTV MusicHip. HealthHealth ServicesMental HealthDiseases s HealthCancerFood SportsPremier DrinksPetsBeauty SafetyPublic SafetyAccidentsLaw EnforcementTraffic AdviceFamily RentLabor IssuesTrouble ScienceEarth NationsMiddle locations, channels, topics, people ... inReuters Follow321K Followers285K Post146M ViewsABOUTReuters provides award winning coverage of the ... | positive | 0.053655 | 0.285548 |
| 111661 | ChatGPT app for smartphones now available in India Know how to download DH Latest News, DH NEWS, Latest News, NEWS, Technology, Apple iPhone X, iPhone users, ChatGPT, chatbot ChatGPT, creators ChatGPT, Aplle iPhone Sunday, May Breaking ChatGPT app for smartphones now available in India Know how to download Public sector bank hikes interest rates on fixed deposits The house where Nazi dictator Hitler was born to be converted into a human rights training centre Here is the timing, schedule and... | positive | 0.193479 | 0.355669 |
| 124068 | MOMENTUM GLOBAL INVESTMENT MANAGEMENT ANNOUNCES STRATEGIC PARTNERSHIP WITH MDOTM LTD TO DEVELOP ARTIFICIAL INTELLIGENCE AI CAPABILITIES AND INSIGHTS DRIVEN INVESTMENT SOLUTIONS Skip to contentNewsElection ClipsLive and CamsClosings DelaysFish Game ForecastFirst Alert Weather ClassesSportsSports ConnectionFootball Friday TopsPrepSpinSign Up for eNewsJob WatchContestsVideo ClipsLive StreamLatest ScheduleContact UsMeet the News TeamAdvertise with UsSubmit a StorySubmit Photo or VideoSubmit Birt... | positive | 0.049083 | 0.403997 |
In [14]:
news[news['tblob_sent'] == 'negative'][['text_ner', 'tblob_sent', 'tblob_score', 'tblob_sub']].sample(3, random_state = 42)
Out[14]:
| text_ner | tblob_sent | tblob_score | tblob_sub | |
|---|---|---|---|---|
| 110487 | Snag a Lifetime Subscription to Jott Pro AI Text Speech Toolkit for a Massive OffThe InventoryThe A.V. RootThe TakeoutThe OnionIt s all of the DayBest Amazon DealsKinja GoodsWe may earn a commission from links on this page.It s all of the DayBest Amazon DealsKinja GoodsTechSnag a Lifetime Subscription to Jott Pro AI Text Speech Toolkit for a Massive OffState of the art, AI driven tech will streamline your workflow, make you more efficient and reduce human error.ByWilliam HelmsPublished6 minu... | negative | -0.076507 | 0.661994 |
| 158858 | Artificial Intelligence in Retail Market is Enhanced by Inception of Exponential Technologies such as Sensors, Robotics, Virtual Reality Jewish Market Reports Skip to content Saturday, April, Contact Jewish Market Reports Jewish Market Research News Market Sales Industry Analysis Market Size Market Report Market Outlook Industry Growth Contact You are hereHomeNews2020 Artificial Intelligence in Retail Market is Enhanced by Inception of Exponential Technologies such as Sensors, Robotics, Virt... | negative | -0.075549 | 0.538839 |
| 191646 | Artificial Intelligence Market Promising Growth Opportunities over TechNews.mobi Market Reports Skip to content TechNews.mobi Market Reports Reporting about the Technology Market Space Electric News NASA Satellite Climate Market Forecast Industry Analysis Market Reports Contact Us Space Electric News NASA Satellite Climate Market Forecast Industry Analysis Market Reports Contact Us Artificial Intelligence Market Promising Growth Opportunities over By email protected Published February, All N... | negative | -0.030757 | 0.475267 |
In [15]:
news.isnull().sum()
Out[15]:
url 0 date 0 language 0 title 0 text 0 year 0 month 0 day 0 text_ner 0 text_cleaned 0 text_lemm 0 title_ner 0 title_cleaned 0 title_lemm 0 title_word_count 0 text_word_count 0 tblob_sent 0 tblob_score 0 tblob_sub 0 dtype: int64
In [16]:
news.to_parquet('news_tblob_sent.parquet')
In [17]:
# Upload target in Google Cloud Storage and the local file to send there
bucket_name = 'nlp-final'
local_file_path = 'news_tblob_sent.parquet'  # local parquet file to upload
file_path = 'news_tblob_sent.parquet'  # object name the file will have inside the bucket

# Resolve client -> bucket -> blob, then push the local file to that blob
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(file_path)
blob.upload_from_filename(local_file_path)
3-(A). Sentiment over time: Polarity Score¶
3.1. Overall Sentiment (Average of Sentiment from Positive and Negative)¶
1. Sentiment Distribution¶
In [18]:
# Number of articles per sentiment label, most frequent first, as a two-column table.
sentiment_counts = (
    news['tblob_sent']
    .value_counts(ascending=False)
    .rename_axis('Sentiment')
    .reset_index(name='Count')
)
sentiment_counts
Out[18]:
| Sentiment | Count | |
|---|---|---|
| 0 | positive | 183719 |
| 1 | negative | 13710 |
| 2 | neutral | 635 |
In [19]:
# Bar chart of the article count for each sentiment label
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(data=sentiment_counts, x='Sentiment', y='Count', ax=ax)
ax.set_title('Sentiment Distribution from tblob Analysis')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()
In [20]:
abs_tblob_score = abs(news['tblob_score'])
In [21]:
abs_tblob_score.describe()
Out[21]:
count 198064.000000 mean 0.112337 std 0.068655 min 0.000000 25% 0.065693 50% 0.105343 75% 0.148454 max 1.000000 Name: tblob_score, dtype: float64
In [22]:
# Density histogram (30 bins) with a KDE overlay of the absolute polarity scores.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(abs_tblob_score, bins=30, kde=True, stat='density', edgecolor='black', ax=ax)
ax.set_title('Distribution of Absolute tblob Polarity Scores')
# These are TextBlob polarity values, not a "compound" score.
ax.set_xlabel('Absolute Polarity Score')
ax.set_ylabel('Density')
plt.show()
In [23]:
tblob_score = news['tblob_score']
In [24]:
tblob_score.describe()
Out[24]:
count 198064.000000 mean 0.104398 std 0.080214 min -1.000000 25% 0.061382 50% 0.103688 75% 0.147511 max 1.000000 Name: tblob_score, dtype: float64
In [25]:
# Density histogram (30 bins) with a KDE overlay of the signed polarity scores.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(tblob_score, bins=30, kde=True, stat='density', edgecolor='black', ax=ax)
ax.set_title('Distribution of tblob Polarity Scores')
# The plotted values are the signed polarity scores, so the axis must not say "Absolute".
ax.set_xlabel('Polarity Score')
ax.set_ylabel('Density')
plt.show()
2. Sentiment Over Time¶
Year¶
In [26]:
# Mean polarity score per year, as a flat two-column table
yearly_sentiment = (
    news.groupby('year')['tblob_score']
    .mean()
    .reset_index()
    .rename(columns={'year': 'Year', 'tblob_score': 'Average_Sentiment'})
)
In [27]:
yearly_sentiment.head()
Out[27]:
| Year | Average_Sentiment | |
|---|---|---|
| 0 | 2020 | 0.077761 |
| 1 | 2021 | 0.090981 |
| 2 | 2022 | 0.114463 |
| 3 | 2023 | 0.110122 |
In [28]:
# Line chart of the yearly mean polarity, one marker per year
sns.set(style="white")
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=yearly_sentiment, x='Year', y='Average_Sentiment', marker='o', ax=ax)
ax.set_title('Yearly Average Sentiment Trend', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
ax.set_xticks(yearly_sentiment['Year'])  # one tick for every year in the table
plt.show()
In [29]:
# Yearly trend drawn straight from the article-level rows
# (lineplot aggregates the repeated 'year' values itself).
sns.set(style="white")
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=news, x='year', y='tblob_score', marker='o', ax=ax)
ax.set_title('Yearly Average Sentiment Over Time', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
plt.show()
Month¶
In [30]:
# Mean polarity score for every (year, month) pair, as a flat table
monthly_sentiment = (
    news.groupby(['year', 'month'])['tblob_score']
    .mean()
    .reset_index()
    .rename(columns={'year': 'Year', 'month': 'Month', 'tblob_score': 'Average_Sentiment'})
)
In [31]:
monthly_sentiment.head()
Out[31]:
| Year | Month | Average_Sentiment | |
|---|---|---|---|
| 0 | 2020 | 1 | 0.085299 |
| 1 | 2020 | 2 | 0.073580 |
| 2 | 2020 | 3 | 0.060021 |
| 3 | 2020 | 4 | 0.061634 |
| 4 | 2020 | 5 | 0.078995 |
In [32]:
# Custom color palette (more colors than there are years in the data)
custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
# A list palette must contain exactly one color per hue level: older seaborn releases
# raise ValueError on a length mismatch and newer ones warn. Size it to the years present.
year_palette = sns.color_palette(custom_colors, n_colors=monthly_sentiment['Year'].nunique())

# One line per year across the 12 months, on a large canvas to prevent overlapping
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=monthly_sentiment, x='Month', y='Average_Sentiment', hue='Year',
             marker='o', palette=year_palette, ax=ax)

# Title says "by Year" (as in the positive/negative sections) to tell this chart
# apart from the chronological "Over Time" chart drawn further down.
ax.set_title('Monthly Average Sentiment by Year', fontsize=16)
ax.set_xlabel('Month', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
ax.set_xticks(range(1, 13))  # month numbers 1..12
ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])

# Move the legend outside of the plot
ax.legend(title='Year', bbox_to_anchor=(1.02, 1.02), loc='upper left')
plt.show()
In [33]:
# Build a 'Year-Month' text label; zfill(2) left-pads single-digit months with a zero.
year_part = monthly_sentiment['Year'].astype(str).str.zfill(2)
month_part = monthly_sentiment['Month'].astype(str).str.zfill(2)
monthly_sentiment['Year_Month'] = year_part + '-' + month_part
In [34]:
monthly_sentiment.head()
Out[34]:
| Year | Month | Average_Sentiment | Year_Month | |
|---|---|---|---|---|
| 0 | 2020 | 1 | 0.085299 | 2020-01 |
| 1 | 2020 | 2 | 0.073580 | 2020-02 |
| 2 | 2020 | 3 | 0.060021 | 2020-03 |
| 3 | 2020 | 4 | 0.061634 | 2020-04 |
| 4 | 2020 | 5 | 0.078995 | 2020-05 |
In [35]:
# Parse the 'Year_Month' labels into datetimes so they plot on a real time axis
monthly_sentiment['Year_Month'] = monthly_sentiment['Year_Month'].pipe(pd.to_datetime)
In [36]:
# Chronological line chart of the monthly mean polarity
sns.set(style="white")
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=monthly_sentiment, x='Year_Month', y='Average_Sentiment', marker='o', ax=ax)
ax.set_title('Monthly Average Sentiment Over Time', fontsize=16)
ax.set_xlabel('Year-Month', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
plt.show()
In [37]:
# Polarity by calendar month, drawn straight from the article-level rows
# (all years are pooled because only 'month' is on the x-axis).
sns.set(style="white")
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=news, x='month', y='tblob_score', marker='o', ax=ax)
ax.set_title('Monthly Average Sentiment', fontsize=16)
ax.set_xlabel('Month', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
plt.show()
Day¶
In [38]:
# Mean polarity score for every (year, month, day) combination, as a flat table
daily_sentiment = (
    news.groupby(['year', 'month', 'day'])['tblob_score']
    .mean()
    .reset_index()
    .rename(columns={'year': 'Year', 'month': 'Month', 'day': 'Day',
                     'tblob_score': 'Average_Sentiment'})
)
In [39]:
daily_sentiment.head()
Out[39]:
| Year | Month | Day | Average_Sentiment | |
|---|---|---|---|---|
| 0 | 2020 | 1 | 1 | 0.087578 |
| 1 | 2020 | 1 | 2 | 0.099098 |
| 2 | 2020 | 1 | 3 | 0.111516 |
| 3 | 2020 | 1 | 4 | 0.118009 |
| 4 | 2020 | 1 | 5 | 0.117570 |
In [40]:
# Build a zero-padded 'MM-DD' label so the same calendar day lines up across years
month_part = daily_sentiment['Month'].astype(str).str.zfill(2)
day_part = daily_sentiment['Day'].astype(str).str.zfill(2)
daily_sentiment['Month_Day'] = month_part + '-' + day_part
In [41]:
daily_sentiment.head()
Out[41]:
| Year | Month | Day | Average_Sentiment | Month_Day | |
|---|---|---|---|---|---|
| 0 | 2020 | 1 | 1 | 0.087578 | 01-01 |
| 1 | 2020 | 1 | 2 | 0.099098 | 01-02 |
| 2 | 2020 | 1 | 3 | 0.111516 | 01-03 |
| 3 | 2020 | 1 | 4 | 0.118009 | 01-04 |
| 4 | 2020 | 1 | 5 | 0.117570 | 01-05 |
In [42]:
# Custom color palette (more colors than there are years in the data)
custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
# A list palette must contain exactly one color per hue level: older seaborn releases
# raise ValueError on a length mismatch and newer ones warn. Size it to the years present.
year_palette = sns.color_palette(custom_colors, n_colors=daily_sentiment['Year'].nunique())

# Set the style to white (no grid)
sns.set(style="white")

# One line per year over the month-day labels
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=daily_sentiment, x='Month_Day', y='Average_Sentiment', hue='Year',
             palette=year_palette, ax=ax)
ax.set_title('Daily Average Sentiment Trend By Year', fontsize=16)
ax.set_xlabel('Month-Day', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)

# Label only every 10th distinct month-day so the axis stays readable (adjust the step as needed)
x_ticks = daily_sentiment['Month_Day'].unique()[::10]
plt.xticks(x_ticks, rotation=90)

# Place the legend outside the plot and leave room for it on the right
ax.legend(title='Year', bbox_to_anchor=(1.01, 1.01), loc='upper left')
fig.subplots_adjust(right=0.8)
plt.show()
In [43]:
# Mean polarity score per value of the 'date' column
daily_sentiment2 = (
    news.groupby('date')['tblob_score']
    .mean()
    .reset_index()
    .rename(columns={'date': 'Date', 'tblob_score': 'Average_Sentiment'})
)
In [44]:
daily_sentiment2.head()
Out[44]:
| Date | Average_Sentiment | |
|---|---|---|
| 0 | 2020-01-01 | 0.087578 |
| 1 | 2020-01-02 | 0.099098 |
| 2 | 2020-01-03 | 0.111516 |
| 3 | 2020-01-04 | 0.118009 |
| 4 | 2020-01-05 | 0.117570 |
In [45]:
# Chronological line chart of the per-date mean polarity
sns.set(style="white")
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=daily_sentiment2, x='Date', y='Average_Sentiment', ax=ax)
ax.set_title('Daily Average Sentiment Over Time', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
# Tick every 30th distinct date, rotated for readability (adjust the step as needed)
x_ticks = daily_sentiment2['Date'].unique()[::30]
plt.xticks(x_ticks, rotation=90)
plt.show()
In [46]:
# Polarity by day of month, drawn straight from the article-level rows
# (all months and years are pooled because only 'day' is on the x-axis).
sns.set(style="white")
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=news, x='day', y='tblob_score', marker='o', ax=ax)
ax.set_title('Daily Average Sentiment', fontsize=16)
ax.set_xlabel('Day', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
plt.show()
3.2. Positive Sentiment (Average of Sentiment from Positive)¶
1. Sentiment Distribution¶
In [47]:
# Keep only the articles labelled 'positive'
news_po = news.loc[news['tblob_sent'] == 'positive']
In [48]:
po_tblob_score = news_po['tblob_score']
In [49]:
po_tblob_score.describe()
Out[49]:
count 1.837190e+05 mean 1.168289e-01 std 6.668608e-02 min 2.220446e-18 25% 7.167363e-02 50% 1.094301e-01 75% 1.515162e-01 max 1.000000e+00 Name: tblob_score, dtype: float64
In [50]:
# Density histogram (30 bins) with a KDE overlay of the positive articles' polarity scores.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(po_tblob_score, bins=30, kde=True, stat='density', edgecolor='black', ax=ax)
ax.set_title('Distribution of tblob Polarity Scores from Positive Sentiment')
# These are TextBlob polarity values, not a "compound" score.
ax.set_xlabel('Polarity Score')
ax.set_ylabel('Density')
plt.show()
2. Sentiment Over Time¶
Year¶
In [51]:
# Mean polarity score per year among the positive articles, as a flat two-column table
yearly_sentiment = (
    news_po.groupby('year')['tblob_score']
    .mean()
    .reset_index()
    .rename(columns={'year': 'Year', 'tblob_score': 'Average_Sentiment'})
)
In [52]:
yearly_sentiment.head()
Out[52]:
| Year | Average_Sentiment | |
|---|---|---|
| 0 | 2020 | 0.107211 |
| 1 | 2021 | 0.111604 |
| 2 | 2022 | 0.124247 |
| 3 | 2023 | 0.117383 |
In [53]:
# Line chart of the yearly mean polarity of positive articles, one marker per year
sns.set(style="white")
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=yearly_sentiment, x='Year', y='Average_Sentiment', marker='o', ax=ax)
ax.set_title('Yearly Average Sentiment Trend from Positive Sentiment', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
ax.set_xticks(yearly_sentiment['Year'])  # one tick for every year in the table
plt.show()
In [54]:
# Yearly trend drawn straight from the positive article-level rows
sns.set(style="white")
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=news_po, x='year', y='tblob_score', marker='o', ax=ax)
ax.set_title('Yearly Average Sentiment Over Time from Positive Sentiment', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
plt.show()
Month¶
In [55]:
# Mean polarity score for every (year, month) pair among the positive articles
monthly_sentiment = (
    news_po.groupby(['year', 'month'])['tblob_score']
    .mean()
    .reset_index()
    .rename(columns={'year': 'Year', 'month': 'Month', 'tblob_score': 'Average_Sentiment'})
)
In [56]:
monthly_sentiment.head()
Out[56]:
| Year | Month | Average_Sentiment | |
|---|---|---|---|
| 0 | 2020 | 1 | 0.106327 |
| 1 | 2020 | 2 | 0.101878 |
| 2 | 2020 | 3 | 0.101978 |
| 3 | 2020 | 4 | 0.103658 |
| 4 | 2020 | 5 | 0.107567 |
In [57]:
# Custom color palette (more colors than there are years in the data)
custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
# A list palette must contain exactly one color per hue level: older seaborn releases
# raise ValueError on a length mismatch and newer ones warn. Size it to the years present.
year_palette = sns.color_palette(custom_colors, n_colors=monthly_sentiment['Year'].nunique())

# One line per year across the 12 months, on a large canvas to prevent overlapping
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=monthly_sentiment, x='Month', y='Average_Sentiment', hue='Year',
             marker='o', palette=year_palette, ax=ax)
ax.set_title('Monthly Average Sentiment by Year from Positive Sentiment', fontsize=16)
ax.set_xlabel('Month', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
ax.set_xticks(range(1, 13))  # month numbers 1..12
ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])

# Move the legend outside of the plot
ax.legend(title='Year', bbox_to_anchor=(1.02, 1.02), loc='upper left')
plt.show()
In [58]:
# Build a 'Year-Month' text label; zfill(2) left-pads single-digit months with a zero.
year_part = monthly_sentiment['Year'].astype(str).str.zfill(2)
month_part = monthly_sentiment['Month'].astype(str).str.zfill(2)
monthly_sentiment['Year_Month'] = year_part + '-' + month_part
In [59]:
monthly_sentiment.head()
Out[59]:
| Year | Month | Average_Sentiment | Year_Month | |
|---|---|---|---|---|
| 0 | 2020 | 1 | 0.106327 | 2020-01 |
| 1 | 2020 | 2 | 0.101878 | 2020-02 |
| 2 | 2020 | 3 | 0.101978 | 2020-03 |
| 3 | 2020 | 4 | 0.103658 | 2020-04 |
| 4 | 2020 | 5 | 0.107567 | 2020-05 |
In [60]:
# Parse the 'Year_Month' labels into datetimes so they plot on a real time axis
monthly_sentiment['Year_Month'] = monthly_sentiment['Year_Month'].pipe(pd.to_datetime)
In [61]:
# Chronological line chart of the monthly mean polarity of positive articles
sns.set(style="white")
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=monthly_sentiment, x='Year_Month', y='Average_Sentiment', marker='o', ax=ax)
ax.set_title('Monthly Average Sentiment Over Time from Positive Sentiment', fontsize=16)
ax.set_xlabel('Year-Month', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
plt.show()
In [62]:
# Polarity by calendar month, drawn straight from the positive article-level rows
sns.set(style="white")
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=news_po, x='month', y='tblob_score', marker='o', ax=ax)
ax.set_title('Monthly Average Sentiment from Positive Sentiment', fontsize=16)
ax.set_xlabel('Month', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
plt.show()
Day¶
In [63]:
# Mean polarity score for every (year, month, day) combination among the positive articles
daily_sentiment = (
    news_po.groupby(['year', 'month', 'day'])['tblob_score']
    .mean()
    .reset_index()
    .rename(columns={'year': 'Year', 'month': 'Month', 'day': 'Day',
                     'tblob_score': 'Average_Sentiment'})
)
In [64]:
daily_sentiment.head()
Out[64]:
| Year | Month | Day | Average_Sentiment | |
|---|---|---|---|---|
| 0 | 2020 | 1 | 1 | 0.094981 |
| 1 | 2020 | 1 | 2 | 0.125209 |
| 2 | 2020 | 1 | 3 | 0.128022 |
| 3 | 2020 | 1 | 4 | 0.128935 |
| 4 | 2020 | 1 | 5 | 0.127711 |
In [65]:
# Build a zero-padded 'MM-DD' label so the same calendar day lines up across years
month_part = daily_sentiment['Month'].astype(str).str.zfill(2)
day_part = daily_sentiment['Day'].astype(str).str.zfill(2)
daily_sentiment['Month_Day'] = month_part + '-' + day_part
In [66]:
daily_sentiment.head()
Out[66]:
| Year | Month | Day | Average_Sentiment | Month_Day | |
|---|---|---|---|---|---|
| 0 | 2020 | 1 | 1 | 0.094981 | 01-01 |
| 1 | 2020 | 1 | 2 | 0.125209 | 01-02 |
| 2 | 2020 | 1 | 3 | 0.128022 | 01-03 |
| 3 | 2020 | 1 | 4 | 0.128935 | 01-04 |
| 4 | 2020 | 1 | 5 | 0.127711 | 01-05 |
In [67]:
# Custom color palette (more colors than there are years in the data)
custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
# A list palette must contain exactly one color per hue level: older seaborn releases
# raise ValueError on a length mismatch and newer ones warn. Size it to the years present.
year_palette = sns.color_palette(custom_colors, n_colors=daily_sentiment['Year'].nunique())

# Set the style to white (no grid)
sns.set(style="white")

# One line per year over the month-day labels
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=daily_sentiment, x='Month_Day', y='Average_Sentiment', hue='Year',
             palette=year_palette, ax=ax)
ax.set_title('Daily Average Sentiment Trend by Year from Positive Sentiment', fontsize=16)
ax.set_xlabel('Month-Day', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)

# Label only every 10th distinct month-day so the axis stays readable (adjust the step as needed)
x_ticks = daily_sentiment['Month_Day'].unique()[::10]
plt.xticks(x_ticks, rotation=90)

# Place the legend outside the plot and leave room for it on the right
ax.legend(title='Year', bbox_to_anchor=(1.01, 1.01), loc='upper left')
fig.subplots_adjust(right=0.8)
plt.show()
In [68]:
# Mean polarity score per value of the 'date' column among the positive articles
daily_sentiment2 = (
    news_po.groupby('date')['tblob_score']
    .mean()
    .reset_index()
    .rename(columns={'date': 'Date', 'tblob_score': 'Average_Sentiment'})
)
In [69]:
daily_sentiment2.head()
Out[69]:
| Date | Average_Sentiment | |
|---|---|---|
| 0 | 2020-01-01 | 0.094981 |
| 1 | 2020-01-02 | 0.125209 |
| 2 | 2020-01-03 | 0.128022 |
| 3 | 2020-01-04 | 0.128935 |
| 4 | 2020-01-05 | 0.127711 |
In [70]:
# Chronological line chart of the per-date mean polarity of positive articles
sns.set(style="white")
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=daily_sentiment2, x='Date', y='Average_Sentiment', ax=ax)
ax.set_title('Daily Average Sentiment Over Time from Positive Sentiment', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
# Tick every 30th distinct date, rotated for readability (adjust the step as needed)
x_ticks = daily_sentiment2['Date'].unique()[::30]
plt.xticks(x_ticks, rotation=90)
plt.show()
In [71]:
# Polarity by day of month, drawn straight from the positive article-level rows
sns.set(style="white")
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=news_po, x='day', y='tblob_score', marker='o', ax=ax)
ax.set_title('Daily Average Sentiment from Positive Sentiment', fontsize=16)
ax.set_xlabel('Day', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
plt.show()
3.3. Negative Sentiment (Average of Sentiment from Negative)¶
1. Sentiment Distribution¶
In [72]:
# Keep only the articles labelled 'negative'
news_ne = news.loc[news['tblob_sent'] == 'negative']
In [73]:
ne_tblob_score = news_ne['tblob_score']
In [74]:
ne_tblob_score.describe()
Out[74]:
count 1.371000e+04 mean -5.734629e-02 std 6.800852e-02 min -1.000000e+00 25% -7.720708e-02 50% -3.826780e-02 75% -1.558900e-02 max -2.523234e-18 Name: tblob_score, dtype: float64
In [75]:
# Density histogram (30 bins) with a KDE overlay of the negative articles' polarity scores.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(ne_tblob_score, bins=30, kde=True, stat='density', edgecolor='black', ax=ax)
ax.set_title('Distribution of tblob Polarity Scores from Negative Sentiment')
# These are TextBlob polarity values, not a "compound" score.
ax.set_xlabel('Polarity Score')
ax.set_ylabel('Density')
plt.show()
2. Sentiment Over Time¶
Year¶
In [76]:
# Mean polarity score per year among the negative articles, as a flat two-column table
yearly_sentiment = (
    news_ne.groupby('year')['tblob_score']
    .mean()
    .reset_index()
    .rename(columns={'year': 'Year', 'tblob_score': 'Average_Sentiment'})
)
In [77]:
yearly_sentiment.head()
Out[77]:
| Year | Average_Sentiment | |
|---|---|---|
| 0 | 2020 | -0.068832 |
| 1 | 2021 | -0.069226 |
| 2 | 2022 | -0.055172 |
| 3 | 2023 | -0.041761 |
In [78]:
# Line chart of the yearly mean polarity of negative articles, one marker per year
sns.set(style="white")
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=yearly_sentiment, x='Year', y='Average_Sentiment', marker='o', ax=ax)
ax.set_title('Yearly Average Sentiment Trend from Negative Sentiment', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
ax.set_xticks(yearly_sentiment['Year'])  # one tick for every year in the table
plt.show()
In [79]:
# Yearly trend drawn straight from the negative article-level rows
sns.set(style="white")
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=news_ne, x='year', y='tblob_score', marker='o', ax=ax)
ax.set_title('Yearly Average Sentiment Over Time from Negative Sentiment', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
plt.show()
Month¶
In [80]:
# Mean polarity score for every (year, month) pair among the negative articles
monthly_sentiment = (
    news_ne.groupby(['year', 'month'])['tblob_score']
    .mean()
    .reset_index()
    .rename(columns={'year': 'Year', 'month': 'Month', 'tblob_score': 'Average_Sentiment'})
)
In [81]:
monthly_sentiment.head()
Out[81]:
| Year | Month | Average_Sentiment | |
|---|---|---|---|
| 0 | 2020 | 1 | -0.062309 |
| 1 | 2020 | 2 | -0.065608 |
| 2 | 2020 | 3 | -0.068339 |
| 3 | 2020 | 4 | -0.065841 |
| 4 | 2020 | 5 | -0.068862 |
In [82]:
# Custom color palette (more colors than there are years in the data)
custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
# A list palette must contain exactly one color per hue level: older seaborn releases
# raise ValueError on a length mismatch and newer ones warn. Size it to the years present.
year_palette = sns.color_palette(custom_colors, n_colors=monthly_sentiment['Year'].nunique())

# One line per year across the 12 months, on a large canvas to prevent overlapping
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=monthly_sentiment, x='Month', y='Average_Sentiment', hue='Year',
             marker='o', palette=year_palette, ax=ax)
# fontsize=16 added so the title matches every other chart title in the notebook
ax.set_title('Monthly Average Sentiment by Year from Negative Sentiment', fontsize=16)
ax.set_xlabel('Month', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
ax.set_xticks(range(1, 13))  # month numbers 1..12
ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])

# Move the legend outside of the plot
ax.legend(title='Year', bbox_to_anchor=(1.02, 1.02), loc='upper left')
plt.show()
In [83]:
# Build a 'Year-Month' text label; zfill(2) left-pads single-digit months with a zero.
year_part = monthly_sentiment['Year'].astype(str).str.zfill(2)
month_part = monthly_sentiment['Month'].astype(str).str.zfill(2)
monthly_sentiment['Year_Month'] = year_part + '-' + month_part
In [84]:
monthly_sentiment.head()
Out[84]:
| Year | Month | Average_Sentiment | Year_Month | |
|---|---|---|---|---|
| 0 | 2020 | 1 | -0.062309 | 2020-01 |
| 1 | 2020 | 2 | -0.065608 | 2020-02 |
| 2 | 2020 | 3 | -0.068339 | 2020-03 |
| 3 | 2020 | 4 | -0.065841 | 2020-04 |
| 4 | 2020 | 5 | -0.068862 | 2020-05 |
In [85]:
# Parse the 'Year_Month' labels into datetimes so they plot on a real time axis
monthly_sentiment['Year_Month'] = monthly_sentiment['Year_Month'].pipe(pd.to_datetime)
In [86]:
# Chronological line chart of the monthly mean polarity of negative articles
sns.set(style="white")
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=monthly_sentiment, x='Year_Month', y='Average_Sentiment', marker='o', ax=ax)
ax.set_title('Monthly Average Sentiment Over Time from Negative Sentiment', fontsize=16)
ax.set_xlabel('Year-Month', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
plt.show()
In [87]:
# Set the style
sns.set(style="white")
# Create a line plot
plt.figure(figsize=(12, 6))
# Plot the negative-sentiment subset (news_ne). This cell previously plotted
# news_po, contradicting its own title and the other plots in this section.
sns.lineplot(data=news_ne, x='month', y='tblob_score', marker='o')
# Customize the plot
plt.title('Monthly Average Sentiment from Negative Sentiment', fontsize=16)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)
# Show the plot
plt.show()
Day¶
In [88]:
# Mean tblob_score of news_ne for every (year, month, day), with presentation-style column names
daily_sentiment = (
    news_ne
    .groupby(['year', 'month', 'day'])['tblob_score']
    .mean()
    .reset_index()
    .rename(columns={
        'year': 'Year',
        'month': 'Month',
        'day': 'Day',
        'tblob_score': 'Average_Sentiment',
    })
)
In [89]:
# Preview the first five rows of the daily aggregate
daily_sentiment.head(n=5)
Out[89]:
| Year | Month | Day | Average_Sentiment | |
|---|---|---|---|---|
| 0 | 2020 | 1 | 1 | -0.082687 |
| 1 | 2020 | 1 | 2 | -0.039064 |
| 2 | 2020 | 1 | 3 | -0.081845 |
| 3 | 2020 | 1 | 4 | -0.040412 |
| 4 | 2020 | 1 | 5 | -0.041304 |
In [90]:
# 'MM-DD' key (both parts zero-padded to two digits) so different years share one x-axis
daily_sentiment['Month_Day'] = (
    daily_sentiment['Month'].astype(str).str.zfill(2)
    .str.cat(daily_sentiment['Day'].astype(str).str.zfill(2), sep='-')
)
In [91]:
# Preview the first five rows, now including the Month_Day key
daily_sentiment.head(n=5)
Out[91]:
| Year | Month | Day | Average_Sentiment | Month_Day | |
|---|---|---|---|---|---|
| 0 | 2020 | 1 | 1 | -0.082687 | 01-01 |
| 1 | 2020 | 1 | 2 | -0.039064 | 01-02 |
| 2 | 2020 | 1 | 3 | -0.081845 | 01-03 |
| 3 | 2020 | 1 | 4 | -0.040412 | 01-04 |
| 4 | 2020 | 1 | 5 | -0.041304 | 01-05 |
In [92]:
# Base color cycle for the per-year lines
custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
# Build a palette with exactly one color per distinct year. A list palette whose
# length differs from the number of hue levels makes seaborn warn (and, on some
# older releases, presumably raise) -- TODO confirm against the installed version.
n_years = daily_sentiment['Year'].nunique()
year_palette = [custom_colors[i % len(custom_colors)] for i in range(n_years)]
# Set the style to white (no grid)
sns.set(style="white")
# Create a line plot with a larger figure size
plt.figure(figsize=(20, 10))
sns.lineplot(data=daily_sentiment, x='Month_Day', y='Average_Sentiment', hue='Year', palette=year_palette)
# Customize the plot
plt.title('Daily Average Sentiment Trend By Year from Negative Sentiment', fontsize=16)
plt.xlabel('Month-Day', fontsize=14)
plt.ylabel('Average Sentiment', fontsize=14)
# Improve x-tick readability: label only every 10th distinct Month_Day value
x_ticks = daily_sentiment['Month_Day'].unique()[::10]  # Adjust the step as needed
plt.xticks(x_ticks, rotation=90)  # Rotate x-ticks for better readability
# Place the legend outside the plot
plt.legend(title='Year', bbox_to_anchor=(1.01, 1.01), loc='upper left')
# Leave room on the right for the legend
plt.subplots_adjust(right=0.8)
# Show the plot
plt.show()
In [93]:
# Mean tblob_score of news_ne per value of the 'date' column
daily_sentiment2 = (
    news_ne
    .groupby('date')['tblob_score']
    .mean()
    .reset_index()
    .rename(columns={'date': 'Date', 'tblob_score': 'Average_Sentiment'})
)
In [94]:
# Preview the first five rows of the per-date aggregate
daily_sentiment2.head(n=5)
Out[94]:
| Date | Average_Sentiment | |
|---|---|---|
| 0 | 2020-01-01 | -0.082687 |
| 1 | 2020-01-02 | -0.039064 |
| 2 | 2020-01-03 | -0.081845 |
| 3 | 2020-01-04 | -0.040412 |
| 4 | 2020-01-05 | -0.041304 |
In [95]:
# Plain white seaborn theme
sns.set(style="white")
# Single continuous line over all dates
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=daily_sentiment2, x='Date', y='Average_Sentiment', ax=ax)
# Title and axis labels
ax.set_title('Daily Average Sentiment Over Time from Negative Sentiment', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
# Label only every 30th distinct date, rotated so the labels do not overlap
x_ticks = daily_sentiment2['Date'].unique()[::30]
plt.xticks(x_ticks, rotation=90)
plt.show()
In [96]:
# Plain white seaborn theme
sns.set(style="white")
# tblob_score of news_ne against the 'day' column; lineplot aggregates rows sharing a day
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=news_ne,
    x='day',
    y='tblob_score',
    marker='o',
    ax=ax,
)
# Title and axis labels
ax.set_title('Daily Average Sentiment from Negative Sentiment', fontsize=16)
ax.set_xlabel('Day', fontsize=14)
ax.set_ylabel('Average Sentiment', fontsize=14)
plt.show()
3-(B). Sentiment over time: Article Numbers¶
In [97]:
# Number of non-null tblob_score values per year
news['tblob_score'].groupby(news['year']).count()
Out[97]:
year 2020 22836 2021 28962 2022 36775 2023 109491 Name: tblob_score, dtype: int64
In [98]:
# Row count of news_po per year, as a two-column frame (year, count)
grouped_data_po = (
    news_po
    .groupby('year')['tblob_score']
    .size()
    .rename('count')
    .reset_index()
)
In [99]:
# Preview the yearly counts for news_po
grouped_data_po.head(n=5)
Out[99]:
| year | count | |
|---|---|---|
| 0 | 2020 | 18884 |
| 1 | 2021 | 25638 |
| 2 | 2022 | 34741 |
| 3 | 2023 | 104456 |
In [100]:
# Plain white seaborn theme
sns.set(style="white")
# One bar per year for the news_po counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=grouped_data_po, x='year', y='count', ax=ax)
# Title and axis labels
ax.set_title('News Article Count(Yearly) from Positive Sentiment', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.show()
In [101]:
# Row count of news_ne per year, as a two-column frame (year, count)
grouped_data_ne = (
    news_ne
    .groupby('year')['tblob_sent']
    .size()
    .rename('count')
    .reset_index()
)
In [102]:
# Preview the yearly counts for news_ne
grouped_data_ne.head(n=5)
Out[102]:
| year | count | |
|---|---|---|
| 0 | 2020 | 3615 |
| 1 | 2021 | 3269 |
| 2 | 2022 | 1941 |
| 3 | 2023 | 4885 |
In [103]:
# Plain white seaborn theme
sns.set(style="white")
# One bar per year for the news_ne counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=grouped_data_ne, x='year', y='count', ax=ax)
# Title and axis labels
ax.set_title('News Article Count(Yearly) from Negative Sentiment', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.show()
In [104]:
# Year x sentiment-label table of article counts; missing combinations become 0
pivot_data = news.pivot_table(
    index='year',
    columns='tblob_sent',
    aggfunc='size',
    fill_value=0,
)
In [105]:
# Preview the year x sentiment count table
pivot_data.head(n=5)
Out[105]:
| tblob_sent | negative | neutral | positive |
|---|---|---|---|
| year | |||
| 2020 | 3615 | 337 | 18884 |
| 2021 | 3269 | 55 | 25638 |
| 2022 | 1941 | 93 | 34741 |
| 2023 | 4885 | 150 | 104456 |
In [106]:
# Plain white seaborn theme
sns.set(style="white")
# Wide-form input: each pivot column (sentiment label) becomes its own solid, marked line
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=pivot_data, markers=True, dashes=False, ax=ax)
# Title and axis labels
ax.set_title('Yearly News Article Count by Sentiment', fontsize=16)
ax.set_xlabel('Year', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
# Legend sits to the right of the axes; shrink the plot area to make room for it
ax.legend(title='Sentiment', loc='upper left', bbox_to_anchor=(1.01, 1.02))
fig.subplots_adjust(right=0.75)
plt.show()
In [107]:
# Tag every article with a 'year-month' key (month zero-padded to two digits)
news['year_month'] = news['year'].astype(str).str.cat(
    news['month'].astype(str).str.zfill(2),
    sep='-',
)
# Article count for each (year_month, sentiment label) pair
grouped_data = (
    news
    .groupby(['year_month', 'tblob_sent'])
    .size()
    .reset_index(name='count')
)
In [108]:
# Plain white seaborn theme
sns.set(style="white")
# Sentiment labels to plot, one figure each
sentiments = ['positive', 'negative', 'neutral']
# Keep every 5th x tick label and blank the rest
skim_factor = 5
for sentiment in sentiments:
    # Rows of the monthly counts belonging to this sentiment label
    data_filtered = grouped_data.loc[grouped_data['tblob_sent'].eq(sentiment)]
    fig, ax = plt.subplots(figsize=(10, 5))
    barplot = sns.barplot(data=data_filtered, x='year_month', y='count', ax=ax)
    # Title and axis labels
    barplot.set_title(f'Monthly Article Count ({sentiment.capitalize()} Sentiment)', fontsize=16)
    barplot.set_xlabel('Year-Month', fontsize=14)
    barplot.set_ylabel('Article Count', fontsize=14)
    # Thin out and rotate the x tick labels
    xtick_labels = barplot.get_xticklabels()
    barplot.set_xticklabels(
        ['' if i % skim_factor else label for i, label in enumerate(xtick_labels)],
        rotation=90,
    )
    plt.show()
In [109]:
# Reshape the long monthly counts to year_month x sentiment for the stacked bars;
# combinations with no articles are filled with 0
pivot_data = (
    grouped_data
    .pivot(index='year_month', columns='tblob_sent', values='count')
    .fillna(0)
)
In [110]:
# One color per sentiment column of pivot_data
custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]  # Add more colors as needed
# Create the axes explicitly and hand it to pandas. DataFrame.plot opens its own
# figure when no `ax` is given, which left the 20x10 figure empty
# ("<Figure size 2000x1000 with 0 Axes>") and drew the chart at the default size.
fig, ax = plt.subplots(figsize=(20, 10))
pivot_data.plot(kind='bar', stacked=True, color=custom_colors, ax=ax)
# Customize the plot
ax.set_title('Monthly Total Article Count with Sentiment Portions', fontsize=16)
ax.set_xlabel('Year-Month', fontsize=14)
ax.set_ylabel('Total Article Count', fontsize=14)
# Rotate the x tick labels and keep only every `skim_factor`-th one
skim_factor = 5  # Adjust this value as needed
xtick_labels = ax.get_xticklabels()
ax.set_xticklabels(
    [label.get_text() if i % skim_factor == 0 else '' for i, label in enumerate(xtick_labels)],
    rotation=90,
)
# Place the legend outside of the plot
ax.legend(title='Sentiment', bbox_to_anchor=(1.01, 1.02), loc='upper left')
# Show the plot
plt.show()
<Figure size 2000x1000 with 0 Axes>
4. Word Count¶
4.1. Original Data¶
In [111]:
# Plain white seaborn theme
sns.set(style="white")
# Box plot of text_word_count, one box per sentiment label (outliers included)
fig, ax = plt.subplots(figsize=(18, 8))
sns.boxplot(data=news, x='tblob_sent', y='text_word_count', ax=ax)
# Title and axis labels
ax.set_title('Word Count Distribution by Sentiment', fontsize=16)
ax.set_xlabel('Sentiment', fontsize=14)
ax.set_ylabel('Word Count', fontsize=14)
plt.show()
In [112]:
# Violin plot of text_word_count, one violin per sentiment label
fig, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(data=news, x='tblob_sent', y='text_word_count', ax=ax)
# Title and axis labels
ax.set_title('Word Count Distribution by Sentiment', fontsize=16)
ax.set_xlabel('Sentiment', fontsize=14)
ax.set_ylabel('Word Count', fontsize=14)
plt.show()
In [113]:
plt.figure(figsize=(10, 6))
# Explicit label -> color mapping. Colors used to be assigned by position in
# unique(), whose order is order-of-appearance in the data, so a sentiment could
# silently receive another sentiment's color.
sentiment_colors = {'positive': 'green', 'negative': 'red', 'neutral': 'gray'}
sentiments = news['tblob_sent'].unique()  # Get unique sentiment categories
for sentiment in sentiments:
    data = news[news['tblob_sent'] == sentiment]  # Filter data for each sentiment category
    # Labels missing from the mapping fall back to black instead of failing
    sns.histplot(data=data, x='text_word_count', label=sentiment, color=sentiment_colors.get(sentiment, 'black'), bins=30, stat='density', element='step')
plt.xlabel('Text Word Count')
plt.ylabel('Density')
plt.title('Distribution of Text Word Count by Sentiment')
plt.legend(title='Sentiment')
plt.show()
4.2. Data without Outliers¶
In [114]:
# Same box plot as before, but with the outlier markers hidden (showfliers=False)
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(
    data=news,
    x='tblob_sent',
    y='text_word_count',
    showfliers=False,
    ax=ax,
)
ax.set_title('Word Count Distribution by Sentiment (Without Outliers)')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Word Count')
plt.show()
In [115]:
# Violin plot with cut=0 passed through to seaborn
fig, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(
    data=news,
    x='tblob_sent',
    y='text_word_count',
    cut=0,
    ax=ax,
)
ax.set_title('Word Count Distribution by Sentiment (Violin Plot)')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Word Count')
plt.show()
In [116]:
# Summary statistics of text_word_count for rows labelled 'positive'
news.loc[news['tblob_sent'] == 'positive', 'text_word_count'].describe()
Out[116]:
count 183719.000000 mean 818.312766 std 619.243402 min 6.000000 25% 490.000000 50% 673.000000 75% 995.000000 max 29325.000000 Name: text_word_count, dtype: float64
In [117]:
# Summary statistics of text_word_count for rows labelled 'negative'
news.loc[news['tblob_sent'] == 'negative', 'text_word_count'].describe()
Out[117]:
count 13710.000000 mean 680.988330 std 409.065458 min 4.000000 25% 413.000000 50% 642.000000 75% 873.000000 max 10083.000000 Name: text_word_count, dtype: float64
In [118]:
# Summary statistics of text_word_count for rows labelled 'neutral'
news.loc[news['tblob_sent'] == 'neutral', 'text_word_count'].describe()
Out[118]:
count 635.000000 mean 46.858268 std 149.197745 min 3.000000 25% 10.000000 50% 12.000000 75% 15.000000 max 1320.000000 Name: text_word_count, dtype: float64
In [119]:
def calculate_outlier_thresholds(series, iqr_multiplier=1.5):
    """Return the IQR-based (lower, upper) outlier bounds of ``series``.

    The bounds are ``Q1 - iqr_multiplier * IQR`` and ``Q3 + iqr_multiplier * IQR``,
    where Q1 and Q3 are the 0.25 and 0.75 quantiles and ``IQR = Q3 - Q1``.

    Args:
        series: Object exposing ``.quantile(q)`` (e.g. a pandas Series).
        iqr_multiplier: How many IQRs beyond the quartiles the bounds extend.
            Defaults to 1.5, the value that was previously hard-coded.

    Returns:
        Tuple ``(lower_bound, upper_bound)``.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr
    return lower_bound, upper_bound
In [120]:
# IQR-based (lower, upper) word-count bounds, computed separately per sentiment label
positive_thresholds = calculate_outlier_thresholds(
    news.loc[news['tblob_sent'] == 'positive', 'text_word_count']
)
negative_thresholds = calculate_outlier_thresholds(
    news.loc[news['tblob_sent'] == 'negative', 'text_word_count']
)
neutral_thresholds = calculate_outlier_thresholds(
    news.loc[news['tblob_sent'] == 'neutral', 'text_word_count']
)
In [121]:
# Show the three (lower, upper) pairs, one per line: positive, negative, neutral
print(positive_thresholds, negative_thresholds, neutral_thresholds, sep='\n')
(-267.5, 1752.5) (-277.0, 1563.0) (2.5, 22.5)
In [122]:
# Set the style
sns.set(style="white")
# Define the figure size
plt.figure(figsize=(10, 6))
# Keep only articles with at most 2000 words
filtered_news = news[news['text_word_count'] <= 2000]
# Explicit label -> color mapping. Colors used to be paired with unique() by
# position, and unique() follows order of appearance in the data, so a sentiment
# could silently receive another sentiment's color.
sentiment_colors = {'positive': 'green', 'negative': 'red', 'neutral': 'gray'}
# Get unique sentiment categories
sentiments = filtered_news['tblob_sent'].unique()
# Plot overlapping histograms for each sentiment category
for sentiment in sentiments:
    # Filter data for each sentiment category
    data = filtered_news[filtered_news['tblob_sent'] == sentiment]['text_word_count']
    # Labels missing from the mapping fall back to black instead of being dropped
    sns.histplot(data, label=sentiment, color=sentiment_colors.get(sentiment, 'black'), element='step', stat='count', common_norm=False, binwidth=50)
# Customize the plot
plt.xlabel('Text Word Count')
plt.ylabel('Article Count')
plt.title('Distribution of Text Word Count by Sentiment (Word Count ≤ 2000)')
# Place the legend outside the plot
plt.legend(title='Sentiment', bbox_to_anchor=(1.01, 1.02), loc='upper left')
# Show the plot
plt.tight_layout()  # Adjust the layout
plt.show()
5. Word Cloud¶
In [123]:
from wordcloud import WordCloud
In [124]:
# Function to generate word cloud
def generate_wordcloud(text, title):
    """Build a word cloud from ``text`` and display it with ``title``.

    Args:
        text: Corpus string passed to ``WordCloud.generate``.
        title: Title drawn above the image.

    Raises:
        ValueError: If ``text`` is empty or contains only whitespace.
    """
    # Fail early with a clear message instead of deep inside the wordcloud library
    if not text or not text.strip():
        raise ValueError("generate_wordcloud requires non-empty text")
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud)
    plt.axis("off")
    # Set the title before tight_layout so the layout pass accounts for it
    plt.title(title, fontsize=20)
    plt.tight_layout(pad=0)
    plt.show()
In [125]:
# Join the text_lemm values of all rows labelled 'positive' into one corpus string
# (swap the label for 'negative' or 'neutral' as needed)
sentiment_text = " ".join(news.loc[news['tblob_sent'] == 'positive', 'text_lemm'])
# Build the cloud, limited to 100 words
wordcloud_sentiment = WordCloud(
    width=800,
    height=400,
    background_color='white',
    max_words=100,
).generate(sentiment_text)
# Render the cloud as an image without axes
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud_sentiment, interpolation='bilinear')
ax.set_title('Positive Sentiment Word Cloud', fontsize=20)
ax.axis('off')
plt.show()
In [126]:
# Join the text_lemm values of all rows labelled 'negative' into one corpus string
sentiment_text = " ".join(news.loc[news['tblob_sent'] == 'negative', 'text_lemm'])
# Build the cloud, limited to 100 words
wordcloud_sentiment = WordCloud(
    width=800,
    height=400,
    background_color='white',
    max_words=100,
).generate(sentiment_text)
# Render the cloud as an image without axes
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud_sentiment, interpolation='bilinear')
ax.set_title('Negative Sentiment Word Cloud', fontsize=20)
ax.axis('off')
plt.show()
In [127]:
# Join the text_lemm values of all rows labelled 'neutral' into one corpus string
sentiment_text = " ".join(news.loc[news['tblob_sent'] == 'neutral', 'text_lemm'])
# Build the cloud, limited to 100 words
wordcloud_sentiment = WordCloud(
    width=800,
    height=400,
    background_color='white',
    max_words=100,
).generate(sentiment_text)
# Render the cloud as an image without axes
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud_sentiment, interpolation='bilinear')
ax.set_title('Neutral Sentiment Word Cloud', fontsize=20)
ax.axis('off')
plt.show()